This notebook is for preparing input files for pyclone-vi. The same files with a minor modification will be used as an input for FastClone.
suppressPackageStartupMessages({
library(tidyverse)
library(readxl)
#library(cDriver) # Calculate CCF, https://github.com/hanasusak/cDriver/
})
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tumor-clone-inference")
input_dir <- file.path(analysis_dir, "input")
data_dir <- file.path(root_dir, "data")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")
maf_files_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal", "results")
#tmb_input_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal", "input")
#scratch_dir <- file.path(root_dir, "scratch")
# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
maf_file <- file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz")
tmb_file <- file.path(maf_files_dir, "tmb_vaf_genomic.tsv")
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv") # file from add-sample-distribution module
nautilus_dec_file <- file.path(input_dir, "deceased_samples.xlsx")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")
#tmb_all_file <- file.path(tmb_input_dir, "snv-mutation-tmb-coding.tsv")
# EXAMPLE
# But to replace with dir with cns data inferred by CNVkit
# PT_Z4BF2NSB_dir <- file.path(input_dir, "cnvkit_data_example/PT_Z4BF2NSB")
cns_dir <- file.path(input_dir, "cnvkit_data")
# File path to results directory
results_dir <-
file.path(analysis_dir, "results")
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
# File path to input directory
pyclonevi_input_dir <-
file.path(analysis_dir, "results", "pyclone-vi-input")
if (!dir.exists(pyclonevi_input_dir)) {
dir.create(pyclonevi_input_dir)
}
# File path to input directory
fastclone_input_dir <-
file.path(analysis_dir, "results", "fastclone-input")
if (!dir.exists(pyclonevi_input_dir)) {
dir.create(pyclonevi_input_dir)
}
# File path to plot directory
pyclone_plots_dir <-
file.path(analysis_dir, "plots", "pyclone-vi")
if (!dir.exists(pyclone_plots_dir)) {
dir.create(pyclone_plots_dir)
}
source(paste0(root_dir, "/figures/scripts/theme.R"))
# Read in histologies file and filter for the pbta cohort
pbta <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
filter(!(experimental_strategy == "RNA-Seq")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id,
tumor_fraction, tumor_ploidy)
# Read in tmb_all file
tmb_df <- readr::read_tsv(tmb_file, guess_max = 100000, show_col_types = FALSE)
# Read maf
# Keep both synonymous and non-synoymous mutations for the current analysis
maf_file_df <- vroom::vroom(maf_file, delim = "\t", col_names = TRUE)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 15533924 Columns: 133── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (59): Hugo_Symbol, Center, NCBI_Build, Chromosome, Strand, Variant_Classification, Variant_Type, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2, dbSNP_R...
dbl (32): Entrez_Gene_Id, Start_Position, End_Position, t_depth, t_ref_count, t_alt_count, n_depth, n_ref_count, n_alt_count, ALLELE_NUM, DISTANCE, STRAND_VEP, PICK...
lgl (42): dbSNP_Val_Status, Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2, Verification_Status, V...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Process maf
maf_df <- maf_file_df %>%
filter(Tumor_Sample_Barcode %in% pbta$Kids_First_Biospecimen_ID#,
#Variant_Classification %in% maf_nonsynonymous
) %>%
dplyr::rename(Kids_First_Biospecimen_ID = Tumor_Sample_Barcode) %>%
left_join(pbta) %>%
left_join(tmb_df) %>%
# Calculate VAF
dplyr::mutate(VAF = t_alt_count / (t_ref_count + t_alt_count),
# Concatenate gene and protein information
gene_protein = paste(Hugo_Symbol, HGVSp_Short, sep = "_")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id, Chromosome,
Start_Position, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
t_ref_count, t_alt_count,
tumor_fraction, tumor_ploidy, VAF, gene_protein, Hugo_Symbol,
mutation_count, tmb) %>%
# Remove hypermutants
filter(!tmb >= 10 #,
# There are alterations with high number of read counts.
# We will exclude those with >1000 for now.
# !t_alt_count >= 1000,
#!t_ref_count >= 1000
) %>%
# Add `normal_cn`: Total copy number of segment in healthy tissue.
# For autosome this will be two and male sex chromosomes one.
# See: https://github.com/Roth-Lab/pyclone-vi
mutate(normal_cn = case_when(grepl("chrY", Chromosome) ~ "1",
TRUE ~ "2")) %>%
# Calculate CCF
# CCF() %>%
# using formula for CCF calculation
mutate(CCF = (2*t_alt_count)/ (t_ref_count + t_alt_count)) #%>%
Joining with `by = join_by(Kids_First_Biospecimen_ID)`Joining with `by = join_by(Hugo_Symbol, Chromosome, Start_Position, Variant_Classification, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2, Kids_First_Biospecimen_ID, HGVSc, HGVSp, HGVSp_Short, t_ref_count, t_alt_count, Kids_First_Participant_ID, cg_id, tumor_descriptor, sample_id, aliquot_id, tumor_fraction, tumor_ploidy)`
#write_tsv(file.path(scratch_dir, "maf_pbta.tsv"))
# Number of samples in the `maf_df`
maf_samples <- print(length(unique(maf_df$Kids_First_Participant_ID)))
[1] 116
# Read patient list
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
select(!c(cancer_group, experimental_strategy)) %>%
left_join(maf_df, by = c("Kids_First_Participant_ID")) %>% # create unique identifiers
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":"),
cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
cg_id_kids = str_replace(cg_id_kids, "/|-", "_"),
cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
cg_id = str_replace(cg_id, "/|-", "_"),
cg_id = str_replace_all(cg_id, " ", "_"),
sample_id = paste(tumor_descriptor, Kids_First_Biospecimen_ID, sep = ":")) #%>%
#filter(!is.na(VAF) #,
#!VAF == 0 #,
#!is.na(tmb)
# )
# Number of samples in the `genomic_paired_df`
genomic_samples <- print(length(unique(genomic_paired_df$Kids_First_Participant_ID)))
[1] 119
# Let's count #specimens per sample
# We will remove any samples with less than 2 specimens as phylogenies require at least 3 taxa
kids_specimens_n_df <- genomic_paired_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_specimens_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_specimens_number = n) %>%
filter(!kids_specimens_number <= 2) %>%
left_join(genomic_paired_df, by = c("Kids_First_Participant_ID")) %>%
#left_join(maf_df) %>%
filter(!is.na(Chromosome))
# Let's confirm and add the number of timepoints per sample
# In case we lost any from filtering hypermutants and high reads/alteration
timepoints_n_df <- kids_specimens_n_df %>%
select(Kids_First_Participant_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_timepoints_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_timepoints_number = n) %>%
filter(!kids_timepoints_number <= 1)
df <- timepoints_n_df %>%
left_join(kids_specimens_n_df, by = c("Kids_First_Participant_ID")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny.tsv"))
# Number of samples in the `genomic_paired_df` with > 3 biospecimens and >2 timepoints
samples_with_3bs <- print(length(unique(df$Kids_First_Participant_ID)))
[1] 34
samples_with_3bs <- print(unique(df$Kids_First_Participant_ID))
[1] "PT_04V47WFC" "PT_0DWRY9ZX" "PT_19GCSK2S" "PT_1H2REHT2" "PT_23NZGSRJ" "PT_3KM9W8S8" "PT_75HRTX4S" "PT_8GN3TQRM" "PT_962TCBVR" "PT_99S5BPE3"
[11] "PT_9S6WMQ92" "PT_AQWDQW27" "PT_C1RDBCVM" "PT_CWXSP19D" "PT_D6AJHDST" "PT_DVXE38EX" "PT_EQX0VT4F" "PT_GTHZF21E" "PT_HFQNKP5X" "PT_HHG37M6W"
[21] "PT_HJMP6PH2" "PT_K8ZV7APT" "PT_KBFM551M" "PT_KTRJ8TFY" "PT_KZ56XHJT" "PT_MDWPRDBT" "PT_MNSEJCDM" "PT_N8W26H19" "PT_NJQ26FHN" "PT_NK8A49X5"
[31] "PT_S4YNE17X" "PT_TP6GS00H" "PT_Z4BF2NSB" "PT_ZZRBX5JT"
# List with samples eligible for phylogeny
# I added the information about to use for phylogenetic inferences
list_df <- df %>%
select(Kids_First_Participant_ID) %>%
unique() %>%
mutate(somatic_germline_phylogeny = case_when(grepl("PT_KZ56XHJT|PT_KTRJ8TFY", Kids_First_Participant_ID) ~ "yes",
TRUE ~ "not_yet")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_list.tsv"))
# Remove large files
rm(genomic_paired_df, kids_specimens_n_df)
nautilus_dec_df <- read_excel(nautilus_dec_file) %>%
right_join(df, by = c("sample_id", "aliquot_id", "tumor_descriptor")) %>%
write_tsv(file.path(results_dir, "nautilus_dec.tsv"))
list_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
unique() %>%
write_tsv(file.path(results_dir, "nautilus_dec_list.tsv"))
# Make list with samples with >2 biospecimens at Deceased timepoint
dec_n_df <- df %>%
filter(tumor_descriptor == "Deceased") %>%
select(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
# dplyr::mutate(kids_dec_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_deceased_bs_number = n) %>%
filter(!kids_deceased_bs_number <= 1) %>%
write_tsv(file.path(results_dir, "kids_dec_multiple_bs_list.tsv"))
###-----------------------------------------------------
# let's look into PT_3KM9W8S8
#PT_3KM9W8S8_df <- nautilus_dec_df %>%
# filter(Kids_First_Participant_ID == "PT_3KM9W8S8",
# Kids_First_Biospecimen_ID == "BS_2NQXY528") %>%
# select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
# unique() %>%
# write_tsv(file.path(results_dir, "nautilus_dec_list.tsv"))
#bs_id <- print(unique(PT_3KM9W8S8_df$Kids_First_Biospecimen_ID))
We need to generate the input files according to the method’s template. Phylogenetic methods require at least 2 samples per tumor site (multiregional sampling per anatomical site). Here, we will consider kids samples with more than 2 timepoints with one or more biospecimens. We will compare later differences in samples with single vs multiple biospecimens.
# Create pyclone df for all samples
pyclone_all_samples_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id,
cg_id_kids, mutation_id, sample_id,
tumor_descriptor, Chromosome, Start_Position,
Reference_Allele, Tumor_Seq_Allele2, t_ref_count, t_alt_count,
normal_cn, tumor_fraction, mutation_count) %>%
# change names to match input requirements
dplyr::rename("ref_counts" = "t_ref_count",
"alt_counts" = "t_alt_count",
"tumour_content" = "tumor_fraction") %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor,
cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, tumour_content, mutation_count)
samples_pyclone <- print(unique(pyclone_all_samples_df$Kids_First_Participant_ID))
[1] "PT_04V47WFC" "PT_0DWRY9ZX" "PT_19GCSK2S" "PT_1H2REHT2" "PT_23NZGSRJ" "PT_3KM9W8S8" "PT_75HRTX4S" "PT_8GN3TQRM" "PT_962TCBVR" "PT_99S5BPE3"
[11] "PT_9S6WMQ92" "PT_AQWDQW27" "PT_C1RDBCVM" "PT_CWXSP19D" "PT_D6AJHDST" "PT_DVXE38EX" "PT_EQX0VT4F" "PT_GTHZF21E" "PT_HFQNKP5X" "PT_HHG37M6W"
[21] "PT_HJMP6PH2" "PT_K8ZV7APT" "PT_KBFM551M" "PT_KTRJ8TFY" "PT_KZ56XHJT" "PT_MDWPRDBT" "PT_MNSEJCDM" "PT_N8W26H19" "PT_NJQ26FHN" "PT_NK8A49X5"
[31] "PT_S4YNE17X" "PT_TP6GS00H" "PT_Z4BF2NSB" "PT_ZZRBX5JT"
# Remove large files
rm(nautilus_dec_df)
# I will test one HGG dataset for now
# PT_Z4BF2NSB
#PT_Z4BF2NSB_DF <- pyclone_all_samples_df %>%
# filter(Kids_First_Participant_ID == "PT_Z4BF2NSB")
# Create and save pyclone_input!
pyclone_input <- data_list_bind %>%
# Rename to match input format
# To figure out if the assignment is correct
dplyr::rename("major_cn" = "cn1",
"minor_cn" = "cn2") %>%
left_join(pyclone_all_samples_df) %>%
filter(!is.na(major_cn),
!is.na(minor_cn)) %>%
dplyr::mutate(mutation_id = paste(mutation_id, major_cn, minor_cn, ref_counts, alt_counts, sep = ":")) %>%
select(Kids_First_Participant_ID, tumor_descriptor, cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, major_cn, minor_cn, tumour_content, mutation_count) %>%
# To ensure there are no duplicated entries in the dataframe
distinct() %>%
filter(!is.na(Kids_First_Participant_ID))
Joining with `by = join_by(Kids_First_Biospecimen_ID)`Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
samples_pyclone <- print(unique(pyclone_input$Kids_First_Participant_ID))
[1] "PT_KTRJ8TFY" "PT_KZ56XHJT" "PT_Z4BF2NSB"
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
#sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 10)[,9])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
# Save input file for pyclone-vi
pyclone_fname <- paste0(pyclonevi_input_dir, "/", sample_name[x], ".tsv")
print(pyclone_fname)
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
#select(-c(Kids_First_Participant_ID)) %>%
write_tsv(file.path(pyclone_fname))
# Save input file for FastClone
fastclone_fname <- paste0(fastclone_input_dir, "/", sample_name[x], ".tsv")
print(fastclone_fname)
fastclone_input_subset <- pyclone_input %>%
dplyr::rename("var_counts" = "alt_counts") %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
#select(-c(Kids_First_Participant_ID)) %>%
write_tsv(file.path(fastclone_fname))
}
}
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
# To find the position of duplicate elements in x, use this:
# duplicated(pyclone_input)
# Extract duplicate elements:
# pyclone_input[duplicated(pyclone_input)]
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) %>%
mutate(tumor_descriptor = color_names)
# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$tumor_descriptor
# Define timepoints
timepoints = c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
select(-c(Kids_First_Participant_ID)) #%>%
# mutate(tumor_descriptor = factor(tumor_descriptor),
# tumor_descriptor = fct_relevel(tumor_descriptor, timepoints)) %>%
#arrange(tumor_descriptor, sample_id)
# Make this reproducible
set.seed(2023)
# Define label for plots
Timepoint <- factor(x = pyclone_input_subset$tumor_descriptor, levels = timepoints)
#######################
# Create bxp ref_counts
p <- print(ggplot(pyclone_input_subset, aes(sample_id, ref_counts, color = Timepoint)) +
geom_jitter(width = 0.15, size = 0.7, alpha = 0.6) +
ggplot2::geom_boxplot(color = "black",
size = 0.25,
alpha = 0,
coef = 0) + # remove whiskers
theme_Publication() +
scale_color_manual(values = palette,
breaks = sort(names(palette))) +
#rotate() +
theme(axis.text.x = element_text(angle = 90)) +
stat_summary(fun.y=mean,shape=1,col='black',geom='point') +
labs(title = sample_name[x],
x = "sample_id",
y = "ref_counts",
color = "Timepoint"))
# Save the plot
ggsave(filename = paste0(sample_name[x], "-ref_counts.pdf"),
path = pyclone_plots_dir,
width = 6,
height = 5,
device = "pdf",
useDingbats = FALSE)
#######################
# Create bxp alt_counts
p <- print(ggplot(pyclone_input_subset, aes(sample_id, alt_counts, color = Timepoint)) +
geom_jitter(width = 0.15, size = 0.7, alpha = 0.6) +
ggplot2::geom_boxplot(color = "black",
size = 0.25,
alpha = 0,
coef = 0) + # remove whiskers
theme_Publication() +
scale_color_manual(values = palette,
breaks = sort(names(palette))) +
#rotate() +
theme(axis.text.x = element_text(angle = 90)) +
stat_summary(fun.y=mean,shape=1,col='black',geom='point') +
labs(title = sample_name[x],
x = "sample_id",
y = "alt_counts",
color = "Timepoint"))
# Save the plot
ggsave(filename = paste0(sample_name[x], "-alt_counts.pdf"),
path = pyclone_plots_dir,
width = 6,
height = 5,
device = "pdf",
useDingbats = FALSE)
}
}
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
select(-c(Kids_First_Participant_ID))
# Define parameters for function
ylim = max(pyclone_input_subset$mutation_count)
# Rename legend for timepoints
Timepoint <- factor(pyclone_input_subset$tumor_descriptor)
# Plot stacked barplot
print(ggplot(pyclone_input_subset, aes(x = sample_id,
y = mutation_count,
fill = Timepoint)) +
geom_col(position = position_stack(reverse = TRUE)) +
geom_bar(stat = "identity", width = 0.5) +
scale_fill_manual(values = palette, breaks = sort(names(palette))) +
theme_Publication() +
theme(axis.text.x = element_text(angle = 85,
hjust = 1,
vjust = 1)) +
labs(title = paste(sample_name)) +
labs(x = "sample_id", y = "Total Mutations") +
ylim(0, ylim))
}
}
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.6.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggthemes_4.2.4 readxl_1.4.3 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
[10] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] utf8_1.2.4 generics_0.1.3 stringi_1.8.1 hms_1.1.3 digest_0.6.33 magrittr_2.0.3 evaluate_0.23 timechange_0.2.0
[9] fastmap_1.1.1 cellranger_1.1.0 rprojroot_2.0.4 fansi_1.0.5 scales_1.2.1 clonevol_0.99.11 textshaping_0.3.7 cli_3.6.1
[17] rlang_1.1.2 crayon_1.5.2 bit64_4.0.5 munsell_0.5.0 withr_2.5.2 yaml_2.3.7 tools_4.3.1 parallel_4.3.1
[25] tzdb_0.4.0 colorspace_2.1-0 vctrs_0.6.4 R6_2.5.1 lifecycle_1.0.4 bit_4.0.5 vroom_1.6.4 ragg_1.2.6
[33] pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.4 glue_1.6.2 systemfonts_1.0.5 xfun_0.41 tidyselect_1.2.0 rstudioapi_0.15.0
[41] knitr_1.45 farver_2.1.1 htmltools_0.5.7 rmarkdown_2.25 labeling_0.4.3 compiler_4.3.1